newdata <- read.csv("new data.csv")
#data transformation: generate a new variable to measure the level of racial diversity
#Create a new dataset(data1) that includes generalized variety
#Have to change the raw number of White, Black, Hispanic to proportion of the whole population.
data1 <- newdata %>%
mutate(Black = BlackE/Totalpop) %>%
mutate(White = WhiteE/Totalpop) %>%
mutate(Hispanic = HispanicE/Totalpop) %>%
mutate(Racial_diversity=1-White^2-Black^2-Hispanic^2) %>%
mutate(norm_Racial_diversity = Racial_diversity*(3/2))
#Try to analysis the data in the single year (2017), which is the same year as the old dateset.
#Create a new dataset that contains only the data in the year of 2017 (data2)
data2 <- filter(data1,Year=="2017")
#Check the Cr plots for test model
fittest <- lm(Voe~Union+Urbanity+Unemployment+GDP_Per_Capita+Totalpop,data = data2)
summary(fittest)
##
## Call:
## lm(formula = Voe ~ Union + Urbanity + Unemployment + GDP_Per_Capita +
## Totalpop, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -217771 -52693 -7043 40849 659954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.106e+04 8.970e+04 0.458 0.649
## Union -1.304e+04 1.610e+05 -0.081 0.936
## Urbanity 6.237e+03 1.605e+05 0.039 0.969
## Unemployment -1.475e+04 2.028e+04 -0.727 0.471
## GDP_Per_Capita 8.505e-01 9.558e-01 0.890 0.378
## Totalpop 1.867e-02 2.517e-03 7.418 2.46e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 124700 on 45 degrees of freedom
## Multiple R-squared: 0.5563, Adjusted R-squared: 0.507
## F-statistic: 11.28 on 5 and 45 DF, p-value: 4.352e-07
par(mfrow=c(2,2))
plot(fittest)
## Warning: not plotting observations with leverage one:
## 45
## Warning: not plotting observations with leverage one:
## 45

crPlots(fittest)

#transformation
#fit1 seems to be the best with the least outliners and more valid assumptions
#Still, in the residuals versus leverage plot, there is a outlier that could impact the coefficient.
#Need to check out which state it is
fit1 <- lm(log(Voe)~Union+Urbanity+Unemployment+GDP_Per_Capita+Totalpop,data = data2)
crPlots(fit1)

summary(fit1)
##
## Call:
## lm(formula = log(Voe) ~ Union + Urbanity + Unemployment + GDP_Per_Capita +
## Totalpop, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8614 -0.6430 0.0000 0.5212 2.0161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.094e+01 6.951e-01 15.740 < 2e-16 ***
## Union 1.179e+00 1.247e+00 0.945 0.34974
## Urbanity -1.188e+00 1.243e+00 -0.956 0.34431
## Unemployment -1.082e-01 1.571e-01 -0.689 0.49452
## GDP_Per_Capita -2.109e-05 7.406e-06 -2.847 0.00662 **
## Totalpop 1.482e-07 1.951e-08 7.596 1.34e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9663 on 45 degrees of freedom
## Multiple R-squared: 0.6056, Adjusted R-squared: 0.5618
## F-statistic: 13.82 on 5 and 45 DF, p-value: 3.461e-08
par(mfrow=c(2,2))
plot(fit1)
## Warning: not plotting observations with leverage one:
## 45
## Warning: not plotting observations with leverage one:
## 45

fit2 <- lm(log(Voe)~Union+Urbanity+Unemployment+log(GDP_Per_Capita)+log(Totalpop),data=data2)
crPlots(fit2)

summary(fit2)
##
## Call:
## lm(formula = log(Voe) ~ Union + Urbanity + Unemployment + log(GDP_Per_Capita) +
## log(Totalpop), data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48200 -0.55114 -0.09603 0.61909 1.46037
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2372 5.7329 -0.739 0.464
## Union 0.9387 1.0327 0.909 0.368
## Urbanity -0.9465 1.0292 -0.920 0.363
## Unemployment -0.2122 0.1278 -1.661 0.104
## log(GDP_Per_Capita) -0.2546 0.4945 -0.515 0.609
## log(Totalpop) 1.1973 0.1115 10.739 5.32e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7999 on 45 degrees of freedom
## Multiple R-squared: 0.7297, Adjusted R-squared: 0.6997
## F-statistic: 24.3 on 5 and 45 DF, p-value: 9.13e-12
par(mfrow=c(2,2))
plot(fit2)
## Warning: not plotting observations with leverage one:
## 45
## Warning: not plotting observations with leverage one:
## 45

#Analyze the data between the year of 2009 to the year of 2017
#Create a new dataset(data3) that excludes year between 2005 and 2008
#fit a linear regression without transformation
data3 <- filter(data1,Year>2008)
fit4 = lm(Voe ~ factor(Year)+Union+Urbanity+Unemployment+GDP_Per_Capita+Totalpop,data = data3)
summary(fit4)
##
## Call:
## lm(formula = Voe ~ factor(Year) + Union + Urbanity + Unemployment +
## GDP_Per_Capita + Totalpop, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -193257 -41159 -12379 28174 741459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.080e+05 2.820e+04 3.830 0.000147 ***
## factor(Year)2010 -1.338e+02 1.800e+04 -0.007 0.994071
## factor(Year)2011 -1.128e+04 1.802e+04 -0.626 0.531749
## factor(Year)2012 -2.144e+04 1.831e+04 -1.171 0.242042
## factor(Year)2013 -1.131e+04 1.810e+04 -0.625 0.532245
## factor(Year)2014 -3.100e+04 1.870e+04 -1.657 0.098137 .
## factor(Year)2015 -3.832e+04 1.956e+04 -1.959 0.050744 .
## factor(Year)2016 -5.013e+04 2.090e+04 -2.398 0.016878 *
## factor(Year)2017 -5.094e+04 2.169e+04 -2.349 0.019269 *
## Union -1.972e+03 3.133e+04 -0.063 0.949832
## Urbanity -2.937e+03 3.133e+04 -0.094 0.925358
## Unemployment -1.240e+04 2.794e+03 -4.437 1.15e-05 ***
## GDP_Per_Capita 4.152e-01 2.166e-01 1.917 0.055877 .
## Totalpop 1.534e-02 6.606e-04 23.225 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90780 on 445 degrees of freedom
## Multiple R-squared: 0.5567, Adjusted R-squared: 0.5437
## F-statistic: 42.98 on 13 and 445 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit4)

crPlots(fit4)

#Based on the Component residue plot, it seems all the varibales should take log transformation?
#Or should we choose the same variable as before to take log transformation?
#Should we include gini to the confound variables?
fit5 = lm(log(Voe) ~ factor(Year)+Union+Urbanity+Unemployment+log(GDP_Per_Capita)+log(Totalpop),data = data3)
summary(fit5)
##
## Call:
## lm(formula = log(Voe) ~ factor(Year) + Union + Urbanity + Unemployment +
## log(GDP_Per_Capita) + log(Totalpop), data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4891 -0.5981 -0.1319 0.6285 2.1767
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.26484 1.88185 -1.204 0.229417
## factor(Year)2010 0.04578 0.15960 0.287 0.774380
## factor(Year)2011 -0.18514 0.15976 -1.159 0.247148
## factor(Year)2012 -0.13545 0.16245 -0.834 0.404846
## factor(Year)2013 -0.14405 0.16050 -0.898 0.369935
## factor(Year)2014 -0.28449 0.16624 -1.711 0.087719 .
## factor(Year)2015 -0.28997 0.17445 -1.662 0.097173 .
## factor(Year)2016 -0.33791 0.18719 -1.805 0.071731 .
## factor(Year)2017 -0.38384 0.19459 -1.973 0.049161 *
## Union 0.16653 0.27765 0.600 0.548968
## Urbanity -0.18483 0.27766 -0.666 0.505983
## Unemployment -0.09824 0.02587 -3.797 0.000167 ***
## log(GDP_Per_Capita) -0.32691 0.15789 -2.070 0.038984 *
## log(Totalpop) 1.12068 0.04054 27.647 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8046 on 445 degrees of freedom
## Multiple R-squared: 0.6682, Adjusted R-squared: 0.6585
## F-statistic: 68.94 on 13 and 445 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit5)

crPlots(fit5)

data4 <- filter(data3, State!="District of Columbia")
fit6 = lm(log(Voe) ~ factor(Year)+Union+Urbanity+Unemployment+log(GDP_Per_Capita)+log(Totalpop),data = data4)
summary(fit6)
##
## Call:
## lm(formula = log(Voe) ~ factor(Year) + Union + Urbanity + Unemployment +
## log(GDP_Per_Capita) + log(Totalpop), data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6513 -0.6246 -0.0998 0.5012 2.2473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.279640 2.629621 -4.670 4.02e-06 ***
## factor(Year)2010 0.003009 0.157390 0.019 0.98475
## factor(Year)2011 -0.178230 0.157410 -1.132 0.25814
## factor(Year)2012 -0.082247 0.160213 -0.513 0.60796
## factor(Year)2013 -0.157984 0.158215 -0.999 0.31857
## factor(Year)2014 -0.211286 0.164241 -1.286 0.19897
## factor(Year)2015 -0.160555 0.173354 -0.926 0.35487
## factor(Year)2016 -0.119583 0.187738 -0.637 0.52448
## factor(Year)2017 -0.127238 0.195956 -0.649 0.51647
## Union 0.164985 0.270878 0.609 0.54279
## Urbanity -0.203854 0.270924 -0.752 0.45219
## Unemployment -0.022057 0.028919 -0.763 0.44605
## log(GDP_Per_Capita) 0.670670 0.242775 2.763 0.00598 **
## log(Totalpop) 1.045557 0.041929 24.936 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7848 on 436 degrees of freedom
## Multiple R-squared: 0.648, Adjusted R-squared: 0.6375
## F-statistic: 61.75 on 13 and 436 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit6)

crPlots(fit6)

#Add racial diversity
fit7 = lm(log(Voe) ~ factor(Year)+Union+Urbanity+Unemployment+log(GDP_Per_Capita)+log(Totalpop)+norm_Racial_diversity,data = data3)
summary(fit7)
##
## Call:
## lm(formula = log(Voe) ~ factor(Year) + Union + Urbanity + Unemployment +
## log(GDP_Per_Capita) + log(Totalpop) + norm_Racial_diversity,
## data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4913 -0.6040 -0.1289 0.6286 2.1776
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.19154 2.06347 -1.062 0.288784
## factor(Year)2010 0.04576 0.15978 0.286 0.774683
## factor(Year)2011 -0.18563 0.16004 -1.160 0.246718
## factor(Year)2012 -0.13655 0.16312 -0.837 0.402979
## factor(Year)2013 -0.14468 0.16084 -0.900 0.368869
## factor(Year)2014 -0.28616 0.16753 -1.708 0.088307 .
## factor(Year)2015 -0.29236 0.17679 -1.654 0.098897 .
## factor(Year)2016 -0.34117 0.19111 -1.785 0.074916 .
## factor(Year)2017 -0.38751 0.19930 -1.944 0.052487 .
## Union 0.16635 0.27797 0.598 0.549858
## Urbanity -0.18463 0.27798 -0.664 0.506913
## Unemployment -0.09896 0.02721 -3.637 0.000308 ***
## log(GDP_Per_Capita) -0.33230 0.16974 -1.958 0.050896 .
## log(Totalpop) 1.11944 0.04301 26.025 < 2e-16 ***
## norm_Racial_diversity 0.01568 0.18006 0.087 0.930652
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8055 on 444 degrees of freedom
## Multiple R-squared: 0.6682, Adjusted R-squared: 0.6578
## F-statistic: 63.88 on 14 and 444 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit7)

crPlots(fit7)

#1 percentage point increase in unionization would increase the vocational training amount to exp(0.16635) -> 1.16635 -> 17% larger?
#Mediation Analysis
model.m <- lm(norm_Racial_diversity~Union+Urbanity+log(Unemployment)+log(GDP_Per_Capita)+log(Totalpop)+factor(Year),data=data3)
model.y <- lm(log(Voe)~Union+norm_Racial_diversity+Urbanity+log(Unemployment)+log(GDP_Per_Capita)+log(Totalpop)+factor(Year),data=data3)
out.1 <- mediate(model.m,model.y,sims=1000,treat = "Union",mediator = "norm_Racial_diversity")
summary(out.1)
##
## Causal Mediation Analysis
##
## Quasi-Bayesian Confidence Intervals
##
## Estimate 95% CI Lower 95% CI Upper p-value
## ACME 0.00192 -0.02447 0.04 0.92
## ADE 0.13230 -0.42702 0.67 0.63
## Total Effect 0.13423 -0.42138 0.68 0.63
## Prop. Mediated 0.00133 -0.35837 0.38 0.93
##
## Sample Size Used: 459
##
##
## Simulations: 1000
plot(out.1)

#In this case, we find the racial diversity does not have a mediation effect.